pip install pmdarima
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/ Requirement already satisfied: pmdarima in /usr/local/lib/python3.7/dist-packages (2.0.1) Requirement already satisfied: statsmodels>=0.13.2 in /usr/local/lib/python3.7/dist-packages (from pmdarima) (0.13.5) Requirement already satisfied: Cython!=0.29.18,!=0.29.31,>=0.29 in /usr/local/lib/python3.7/dist-packages (from pmdarima) (0.29.32) Requirement already satisfied: numpy>=1.21 in /usr/local/lib/python3.7/dist-packages (from pmdarima) (1.21.6) Requirement already satisfied: setuptools!=50.0.0,>=38.6.0 in /usr/local/lib/python3.7/dist-packages (from pmdarima) (57.4.0) Requirement already satisfied: urllib3 in /usr/local/lib/python3.7/dist-packages (from pmdarima) (1.24.3) Requirement already satisfied: joblib>=0.11 in /usr/local/lib/python3.7/dist-packages (from pmdarima) (1.2.0) Requirement already satisfied: scipy>=1.3.2 in /usr/local/lib/python3.7/dist-packages (from pmdarima) (1.7.3) Requirement already satisfied: pandas>=0.19 in /usr/local/lib/python3.7/dist-packages (from pmdarima) (1.3.5) Requirement already satisfied: scikit-learn>=0.22 in /usr/local/lib/python3.7/dist-packages (from pmdarima) (1.0.2) Requirement already satisfied: pytz>=2017.3 in /usr/local/lib/python3.7/dist-packages (from pandas>=0.19->pmdarima) (2022.6) Requirement already satisfied: python-dateutil>=2.7.3 in /usr/local/lib/python3.7/dist-packages (from pandas>=0.19->pmdarima) (2.8.2) Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.7/dist-packages (from python-dateutil>=2.7.3->pandas>=0.19->pmdarima) (1.15.0) Requirement already satisfied: threadpoolctl>=2.0.0 in /usr/local/lib/python3.7/dist-packages (from scikit-learn>=0.22->pmdarima) (3.1.0) Requirement already satisfied: patsy>=0.5.2 in /usr/local/lib/python3.7/dist-packages (from statsmodels>=0.13.2->pmdarima) (0.5.3) Requirement already satisfied: packaging>=21.3 in /usr/local/lib/python3.7/dist-packages (from 
statsmodels>=0.13.2->pmdarima) (21.3) Requirement already satisfied: pyparsing!=3.0.5,>=2.0.2 in /usr/local/lib/python3.7/dist-packages (from packaging>=21.3->statsmodels>=0.13.2->pmdarima) (3.0.9)
import numpy as np
import pandas as pd
import plotly.express as px
import matplotlib.pyplot as plt
import sklearn as sk
from sklearn import impute
from sklearn import preprocessing
import sklearn.externals
import joblib
from sklearn.model_selection import TimeSeriesSplit
from sklearn.impute import KNNImputer
import sklearn.preprocessing
from keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping
import time
import plotly.graph_objects as go
from sklearn import metrics
import statsmodels.api as sm
import statsmodels.tsa.stattools as ts
from statsmodels.tsa.statespace.sarimax import SARIMAX
import tensorflow as tf
import tensorflow.keras.layers as L
import tensorflow.keras.models as M
import tensorflow.keras.backend as K
%matplotlib inline
import plotly.io as pio
# Render plotly figures inline inside the notebook output cells.
pio.renderers.default='notebook'
from google.colab import drive
# Mount Google Drive: the CSV data and the pickled models are read from it below.
drive.mount('/content/drive')
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
# Load the daily temperature series: semicolon-separated file with comma
# as the decimal mark.
df = pd.read_csv("/content/drive/Shareddrives/Mineria /Temperatura1.csv", sep=';', header=0, decimal = ',')
# Attach a complete daily calendar for 2017-2021 and index the frame by it.
Fecha = pd.date_range(start='2017-01-01', end='2021-12-31', freq='D')
df['Fecha'] = Fecha
df = df.set_index('Fecha')
# Report which dates are missing an observation and how many there are.
print(df[df['ValorObservado'].isnull()])
n_missing = df['ValorObservado'].isnull().sum()
print('En total hay' ,
      str(n_missing) ,
      'valores sin información')
print('Correspondientes al {:.3f}% del total'
      .format(n_missing*100/len(df)))
ValorObservado Fecha 2017-08-12 NaN 2017-12-24 NaN 2019-09-15 NaN 2019-09-16 NaN 2019-09-17 NaN 2020-11-12 NaN 2021-01-05 NaN 2021-01-06 NaN 2021-01-07 NaN 2021-01-08 NaN 2021-08-18 NaN 2021-08-20 NaN 2021-12-05 NaN En total hay 13 valores sin información Correspondientes al 0.712% del total
La serie presenta valores faltantes, por lo tanto se imputarán usando el método de vecinos más cercanos (KNN), como se muestra a continuación.
# Fill the missing observations with a 5-nearest-neighbours imputer
# (uniform weighting), then overwrite the column with the imputed values.
imput = KNNImputer(n_neighbors=5, weights="uniform")
df['ValorObservado'] = imput.fit_transform(df[['ValorObservado']]).ravel()
# Sanity check: no missing values should remain.
print()
print("Valores pérdidos en ValorObservado: " ,
      str(df['ValorObservado'].isnull().sum()))
Valores pérdidos en ValorObservado: 0
# Plot the (imputed) observed series against the date index.
fig = px.line(df, x=df.index, y="ValorObservado")
fig.update_xaxes(title_text="Fecha").show()
Para el respectivo análisis se tomará el 80% de los datos para entrenamiento y validación, y el 20% restante para prueba; dichos valores corresponden a 1460 y 366 observaciones, respectivamente.
from sklearn.preprocessing import MinMaxScaler
# Scale the series to [0, 1]; the fitted scaler is kept so model outputs
# can later be mapped back to the original temperature scale.
scaler = MinMaxScaler(feature_range=(0, 1))
scaled_data = scaler.fit_transform(df.values)
df_norm = pd.DataFrame(
    scaled_data,
    index=df.index,
    columns=['ValorObservadoNormalizado'],
)
def create_dataset(X, y, time_steps=1):
    """Slice a series into overlapping windows for sequence models.

    Each sample holds ``time_steps`` consecutive rows of ``X``; its target
    is the row of ``y`` immediately after that window, so the windows slide
    forward one step at a time.

    Parameters
    ----------
    X, y : pandas objects supporting ``.iloc`` (input and target series).
    time_steps : int, window length.

    Returns
    -------
    tuple of np.ndarray with shapes
    ``(n_samples, time_steps, n_features)`` and ``(n_samples, n_targets)``.
    """
    n_samples = len(X) - time_steps
    windows = [X.iloc[i:i + time_steps].values for i in range(n_samples)]
    targets = [y.iloc[i + time_steps] for i in range(n_samples)]
    return np.array(windows), np.array(targets)
# Chronological 80/20 split — no shuffling, since this is a time series.
train_size = int(len(df_norm) * 0.8)
test_size = len(df_norm) - train_size
train = df_norm.iloc[:train_size]
test = df_norm.iloc[train_size:]
len_train = len(train)
len_test = len(test)
print(len_train, len_test)
1460 366
time_steps = 50
# Window each split into [samples, time_steps, n_features] tensors; the
# series predicts itself one step ahead (X and y are the same frame).
X_train, y_train = create_dataset(train, train, time_steps)
X_test, y_test = create_dataset(test, test, time_steps)
for _label, _arr in (("X_train", X_train), ("y_train", y_train),
                     ("X_test", X_test), ("y_test", y_test)):
    print(_label + ".shape = ", _arr.shape)
X_train.shape = (1410, 50, 1) y_train.shape = (1410, 1) X_test.shape = (316, 50, 1) y_test.shape = (316, 1)
# Plot the normalized series against the date index.
fig = px.line(df_norm, x=df.index, y='ValorObservadoNormalizado')
fig.update_xaxes(title_text="Fecha").update_yaxes(title_text="ValorObservadoNormalizado").show()
#ARIMA
# Load the pre-computed ARIMA test-set forecast, then refit a SARIMAX(3,1,3)
# with no trend term on the training portion (first 1460 daily observations)
# so in-sample predictions can be produced below.
ARIMA_model = joblib.load('/content/drive/Shareddrives/Mineria /ARIMA_Predict')
ARIMA_final= sm.tsa.statespace.SARIMAX(df[:1460], trend='n', order=(3, 1, 3))
# FIX: the original call passed use_boxcox=True to fit(), but SARIMAX.fit()
# does not accept that keyword — statsmodels emitted a FutureWarning saying
# it has no effect and will raise after release 0.14 (see the captured
# warning output). Dropping it leaves the fitted results unchanged.
results_final= ARIMA_final.fit()
#SRNN
SRNN_model = joblib.load('/content/drive/Shareddrives/Mineria /SRNN')
#GRU
GRU_model = joblib.load('/content/drive/Shareddrives/Mineria /GRU')
#LSTM
LSTM_model = joblib.load('/content/drive/Shareddrives/Mineria /LSTM')
/usr/local/lib/python3.7/dist-packages/statsmodels/tsa/base/tsa_model.py:471: ValueWarning: No frequency information was provided, so inferred frequency D will be used. /usr/local/lib/python3.7/dist-packages/statsmodels/tsa/base/tsa_model.py:471: ValueWarning: No frequency information was provided, so inferred frequency D will be used. /usr/local/lib/python3.7/dist-packages/statsmodels/tsa/statespace/sarimax.py:966: UserWarning: Non-stationary starting autoregressive parameters found. Using zeros as starting parameters. /usr/local/lib/python3.7/dist-packages/statsmodels/tsa/statespace/sarimax.py:978: UserWarning: Non-invertible starting MA parameters found. Using zeros as starting parameters. /usr/local/lib/python3.7/dist-packages/statsmodels/base/optimizer.py:23: FutureWarning: Keyword arguments have been passed to the optimizer that have no effect. The list of allowed keyword arguments for method lbfgs is: m, pgtol, factr, maxfun, epsilon, approx_grad, bounds, loglike_and_score. The list of unsupported keyword arguments passed include: use_boxcox. After release 0.14, this will raise. /usr/local/lib/python3.7/dist-packages/statsmodels/base/model.py:606: ConvergenceWarning: Maximum Likelihood optimization failed to converge. Check mle_retvals
# One-step-ahead fitted values over the training window for every model.
#ARIMA: in-sample predictions from the refitted SARIMAX.
ARIMA_train = results_final.predict(start=1, end=len(train))
#SRNN / GRU / LSTM: predict on the windowed training tensors, then map the
# normalized outputs back to the original temperature scale.
SRNN_train = scaler.inverse_transform(SRNN_model.predict(X_train))
GRU_train = scaler.inverse_transform(GRU_model.predict(X_train))
LSTM_train = scaler.inverse_transform(LSTM_model.predict(X_train))
45/45 [==============================] - 0s 5ms/step 45/45 [==============================] - 1s 7ms/step 45/45 [==============================] - 1s 14ms/step
seq_len = 50
# x-axis: the dates that have a full 50-step window behind them.
_train_dates = df.index[seq_len : len(y_train) + seq_len]
# (label, series) pairs, plotted in the same order as before. The ARIMA
# series is offset by 45 to line up with the windowed neural-net samples.
_train_traces = [
    ("Entrenamiento", scaler.inverse_transform(y_train).ravel()),
    ("ARIMA", ARIMA_train[45:].ravel()),
    ("SRNN", SRNN_train.ravel()),
    ("GRU", GRU_train.ravel()),
    ("LSTM", LSTM_train.ravel()),
]
fig = go.Figure()
for _name, _values in _train_traces:
    fig.add_trace(
        go.Scatter(x=_train_dates, y=_values, mode="lines", name=_name)
    )
fig.update_xaxes(title_text="Fecha")
fig.update_yaxes(title_text="ValorObservado")
fig.show()
# Test-window forecasts for every model.
#ARIMA: pre-computed forecast loaded from Drive.
ARIMA_test= ARIMA_model['ARIMA_Predict']
#SRNN / GRU / LSTM: predict on the windowed test tensors and de-normalize
# back to the original temperature scale.
SRNN_test = scaler.inverse_transform(SRNN_model.predict(X_test))
GRU_test = scaler.inverse_transform(GRU_model.predict(X_test))
LSTM_test = scaler.inverse_transform(LSTM_model.predict(X_test))
10/10 [==============================] - 0s 8ms/step 10/10 [==============================] - 0s 12ms/step 10/10 [==============================] - 0s 26ms/step
# Compare each model's test-window forecast against the held-out data.
_test_dates = df.index[len(y_train) + seq_len :]
_test_traces = [
    ("Prueba", scaler.inverse_transform(y_test).ravel()),
    ("ARIMA", ARIMA_test[45:]),
    ("SRNN", SRNN_test.ravel()),
    ("GRU", GRU_test.ravel()),
    ("LSTM", LSTM_test.ravel()),
]
fig = go.Figure()
for _name, _values in _test_traces:
    fig.add_trace(
        go.Scatter(x=_test_dates, y=_values, mode="lines", name=_name)
    )
fig.update_xaxes(title_text="Fecha")
fig.update_yaxes(title_text="ValorObservado")
fig.show()
# Test-window prediction errors (actual minus forecast) per model.
_err_dates = df.index[len(y_train) + seq_len :]
_actual = scaler.inverse_transform(y_test).ravel()
_error_traces = [
    ("ARIMA", _actual - ARIMA_test[45:]),
    ("SRNN", _actual - SRNN_test.ravel()),
    ("GRU", _actual - GRU_test.ravel()),
    ("LSTM", _actual - LSTM_test.ravel()),
]
fig = go.Figure()
for _name, _err in _error_traces:
    fig.add_trace(
        go.Scatter(x=_err_dates, y=_err, mode="lines", name=_name)
    )
fig.update_xaxes(title_text="Fecha")
fig.update_yaxes(title_text="Error")
fig.show()
| Modelo | SARIMA | SRNN | LSTM | GRU |
|---|---|---|---|---|
| ECM (1 paso adelante) | 0.68 | 0.72305 | 0.680276 | 0.835264 |
| ECM (5 pasos adelante) | 0.84 | 1.555407 | 1.494980 | 1.525437 |
| ECM (100 retardos) | — | 0.695902 | 0.633088 | 0.649107 |